import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import scale
import pandas as pd
import seaborn as sns
import numpy as np
Skewness < -1 or > 1: the distribution is highly skewed
Skewness between -1 and -.5 or between .5 and 1: the distribution is moderately skewed
Skewness between -.5 and .5: the distribution is approximately symmetric
# Read the diabetes CSV from the working directory, then display the
# descriptive statistics of its columns.
Diabetes = pd.read_csv(filepath_or_buffer="diabetes.csv")
Diabetes.describe()
It is evident that the data has anomalies and skewness:
1) Looking at the quartiles of Pregnancies, there is a huge gap between the 75th percentile and the maximum, hence there is an anomaly in the variable. The same pattern exists in other variables.
2) Variables like Glucose, BP, SkinThickness, Insulin and BMI can never be zero, so a zero is an anomaly. These variables have lower- and upper-limit outliers, which shall be treated as per industry standards.
# Basic structure of the data set.
Diabetes.shape  # Shape of data is R-768 C-9
Diabetes.info()

# Shape of the subset in which none of the listed measurements equals 0.
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
        'DiabetesPedigreeFunction']
Diabetes[~(Diabetes[cols] == 0).any(axis=1)].shape

# Documentation lookups, written with help() so they are valid plain Python.
help(pd.pivot_table)
help(pd.crosstab)

# Outcome x Pregnancies contingency table, also written to p.csv.
x = pd.crosstab(index=Diabetes.Outcome, columns=Diabetes.Pregnancies)
x.to_csv("p.csv")
x = pd.DataFrame(x)
x.cumsum(axis=1)
x[x.index == 1] = 0  # overwrite the Outcome == 1 row of the table with zeros
x

Diabetes.groupby(['Outcome']).agg(['mean', 'std'])
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.Glucose)
# Draw one vertical box plot per column on a single subplot grid.
l = Diabetes.columns.values
number_of_columns = 9
# Number of grid rows beyond the first. Integer division is required: the
# unparenthesised `len(l)-1/number_of_columns` produced a float (len(l) - 0.11),
# which is not a valid subplot row count.
number_of_rows = (len(l) - 1) // number_of_columns
# Two inches of width per panel, five inches of height per grid row.
plt.figure(figsize=(2 * number_of_columns, 5 * (number_of_rows + 1)))
sns.set_style('whitegrid')
for i, column in enumerate(l):
    plt.subplot(number_of_rows + 1, number_of_columns, i + 1)
    # `y=` gives a vertical box on every seaborn version.
    sns.boxplot(y=Diabetes[column], color='green', orient='v')
# Lay the finished figure out once instead of after every panel.
plt.tight_layout()
# Names of the first eight columns of the frame.
l = Diabetes.columns[:8].values
def outliertreat(x):
    """Summarise the percentiles and IQR-based outlier limits of a Series.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to summarise.

    Returns
    -------
    pandas.Series
        Entries labelled ``Perc_01``, ``Perc_99``, ``Perc_25``, ``Perc_75``,
        ``IQR``, ``Lower_Lim``, ``Upper_Lim``, ``minimum`` and ``maximum``.
        ``Lower_Lim`` / ``Upper_Lim`` are the quartiles moved outwards by
        1.5 * IQR.
    """
    # The label says 1st percentile, so the quantile must be .01 (not .1).
    Perc_01 = x.quantile(.01)
    Perc_99 = x.quantile(.99)
    Perc_25 = x.quantile(.25)
    Perc_75 = x.quantile(.75)
    IQR = Perc_75 - Perc_25
    Lower_Lim = Perc_25 - 1.5 * IQR
    Upper_Lim = Perc_75 + 1.5 * IQR
    minimum = x.min()
    maximum = x.max()
    return pd.Series(
        [Perc_01, Perc_99, Perc_25, Perc_75, IQR,
         Lower_Lim, Upper_Lim, minimum, maximum],
        index=["Perc_01", "Perc_99", "Perc_25", "Perc_75", "IQR",
               "Lower_Lim", "Upper_Lim", "minimum", "maximum"],
    )
# One row per column in `l`, one field per statistic returned by outliertreat.
Diabetes[l].apply(outliertreat).T
# Cap each feature at fixed outlier limits.
# The clipped column is assigned back explicitly: Series.clip_upper/clip_lower
# were removed in pandas 1.0, and an in-place edit made through attribute
# access is not guaranteed to reach the DataFrame.
Diabetes['Pregnancies'] = Diabetes['Pregnancies'].clip(upper=13)
Diabetes['Glucose'] = Diabetes['Glucose'].clip(lower=37.125)
Diabetes['BloodPressure'] = Diabetes['BloodPressure'].clip(lower=35, upper=107)
Diabetes['SkinThickness'] = Diabetes['SkinThickness'].clip(upper=80)
Diabetes['Insulin'] = Diabetes['Insulin'].clip(upper=318.125)
Diabetes['BMI'] = Diabetes['BMI'].clip(lower=13.35, upper=50.55)
Diabetes['DiabetesPedigreeFunction'] = Diabetes['DiabetesPedigreeFunction'].clip(upper=1.200)
Diabetes['Age'] = Diabetes['Age'].clip(upper=67)
# Redraw one vertical box plot per column on a single subplot grid.
l = Diabetes.columns.values
number_of_columns = 9
# Number of grid rows beyond the first. Integer division is required: the
# unparenthesised `len(l)-1/number_of_columns` produced a float (len(l) - 0.11),
# which is not a valid subplot row count.
number_of_rows = (len(l) - 1) // number_of_columns
# Two inches of width per panel, five inches of height per grid row.
plt.figure(figsize=(2 * number_of_columns, 5 * (number_of_rows + 1)))
sns.set_style('whitegrid')
for i, column in enumerate(l):
    plt.subplot(number_of_rows + 1, number_of_columns, i + 1)
    # `y=` gives a vertical box on every seaborn version.
    sns.boxplot(y=Diabetes[column], color='green', orient='v')
# Lay the finished figure out once instead of after every panel.
plt.tight_layout()
# Per-outcome summaries, and the diabetic rows with 12 pregnancies.
Diabetes.groupby(['Outcome']).agg(['mean', 'std'])
Diabetes.groupby(['Outcome', 'Pregnancies']).agg(['mean'])
Diabetes[(Diabetes.Outcome == 1) & (Diabetes.Pregnancies == 12)]

# Glucose distribution, overall and split by Outcome.
sns.distplot(Diabetes.Glucose)
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.distplot, 'Glucose')
Diabetes.Glucose.skew()

# NOTE(review): the counts in the percentages below are hard-coded, presumably
# copied from the .shape outputs - re-check them if the data changes.
Diabetes.Glucose[Diabetes.Outcome == 1].shape
Perc_concentration_diabetic_women = 266 * 100 / 268
Diabetes.Glucose[(Diabetes.Outcome == 0) & (Diabetes.Glucose > 57.6) & (Diabetes.Glucose < 162.27)].shape
Diabetes.Glucose[Diabetes.Outcome == 0].shape
Perc_concentration_non_diabetic_women = 477 * 100 / 500

x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'Glucose')
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.Glucose)

# IQR-based lower cut-off for Glucose, with low percentiles for comparison.
iqr = Diabetes.Glucose.quantile(.75) - Diabetes.Glucose.quantile(.25)
Lower_ND_Cut_G = Diabetes.Glucose.quantile(.25) - 1.5 * iqr
Lower_ND_Cut_G
Diabetes.Glucose.quantile(.10)
Diabetes.Glucose.quantile(.01)
Since the 1st percentile of Glucose is 57, I would use 37.125 (the IQR-based cut-off computed above) as the lower limit for outlier treatment
# Floor Glucose at the IQR-based lower cut-off. The result is assigned back
# because Series.clip_lower was removed in pandas 1.0 and in-place edits made
# through attribute access are not guaranteed to reach the DataFrame.
Diabetes['Glucose'] = Diabetes['Glucose'].clip(lower=Lower_ND_Cut_G)

# Glucose by Outcome: histograms, box plots and a swarm plot.
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(plt.hist, 'Glucose')
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'Glucose')
# x/y are given by keyword; seaborn >= 0.12 rejects them positionally.
sns.swarmplot(data=Diabetes, x='Outcome', y='Glucose')

# Mean and standard deviation of Glucose for each outcome group.
mean_Glucose_Level_non_Diabetic = Diabetes.Glucose[Diabetes.Outcome == 0].mean()
sd_Glucose_Level_non_Diabetic = Diabetes.Glucose[Diabetes.Outcome == 0].std()
mean_Glucose_Level_Diabetic = Diabetes.Glucose[Diabetes.Outcome == 1].mean()
sd_Glucose_Level_Diabetic = Diabetes.Glucose[Diabetes.Outcome == 1].std()
1) Women who are non diabetic have on an average 3 children with standard deviation of 3 , which means 94.4% of females are having number of babies between 0 to 9
2) Women who are diabetic have on an average 5 children with standard deviation of 4, which means 97% of females are having number of babies between 0 to 12
3) This study doesnt state that diabetes have huge or severe complication in delivery of child
4) The Pregnancies variable has moderate positive skewness, but when we bifurcate it by the outcome variable it is highly positively skewed when the outcome is 0 and approximately symmetric otherwise
5) For non diabetic women , women having pregnancy above 11 is a rare
# Pregnancies distribution, overall and split by Outcome.
sns.distplot(Diabetes.Pregnancies)
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(plt.hist, 'Pregnancies')
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.Pregnancies)

# Cap Pregnancies at Q3 + 1.5 * IQR. The result is assigned back because
# Series.clip_upper was removed in pandas 1.0.
IQR_P = Diabetes.Pregnancies.quantile(.75) - Diabetes.Pregnancies.quantile(.25)
U_Cut_P = Diabetes.Pregnancies.quantile(.75) + 1.5 * IQR_P
Diabetes['Pregnancies'] = Diabetes['Pregnancies'].clip(upper=U_Cut_P)

Diabetes.Pregnancies.skew()
Diabetes.Pregnancies[Diabetes.Outcome == 0].skew()
Diabetes.Pregnancies[Diabetes.Outcome == 1].skew()
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.distplot, 'Pregnancies')
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'Pregnancies')

# Mean and standard deviation of Pregnancies for each outcome group.
mean_Preg_non_Diabetic = Diabetes.Pregnancies[Diabetes.Outcome == 0].mean()
sd_Preg_non_Diabetic = Diabetes.Pregnancies[Diabetes.Outcome == 0].std()
mean_Preg_Diabetic = Diabetes.Pregnancies[Diabetes.Outcome == 1].mean()
sd_Preg_Diabetic = Diabetes.Pregnancies[Diabetes.Outcome == 1].std()
# x/y are given by keyword; seaborn >= 0.12 rejects them positionally.
sns.swarmplot(data=Diabetes, x='Outcome', y='Pregnancies')

# NOTE(review): the counts in the percentages below are hard-coded, presumably
# copied from the .shape outputs - re-check them if the data changes.
Diabetes.Pregnancies[(Diabetes.Outcome == 0) & (Diabetes.Pregnancies >= 0) & (Diabetes.Pregnancies < 10)].shape
Diabetes.Pregnancies[Diabetes.Outcome == 0].shape
# Same value as the former `472/5`, written like the other percentages.
Perc_Concentration_Non_Diabetic = 472 * 100 / 500
Perc_Concentration_Non_Diabetic
Diabetes.Pregnancies[(Diabetes.Outcome == 1) & (Diabetes.Pregnancies >= 0) & (Diabetes.Pregnancies < 13)].shape
Diabetes.Pregnancies[Diabetes.Outcome == 1].shape
Perc_Concentration_Diabetic = 259 * 100 / 268
Perc_Concentration_Diabetic
1) Average Blood Pressure of Non-Diabetic Women is 69.5 with standard deviation of 13.54 , which implies 94% of women blood pressure lies between 42.4 and 96.6. Majority of women have blood pressure level in between 60 to 80
2) Average Blood Pressure of Diabetic Women is 72.9 with standard deviation of 15.11 , which implies 91% of women blood pressure lies between 42.6 and 103.1. Majority of women have BP in between 60 to 95
3) It implies that people having diabetes can keep their blood pressure under control, i.e. within the ideal BP range of 80-120
4) The Blood Pressure variable is approximately symmetric; when we bifurcate it by outcome, it is approximately symmetric for outcome = 0 but highly negatively skewed otherwise
5) There are very less number of non diabetic women whose BP is less than 40 and greater than 100.
6) There are very less number of diabetic women whose BP is less than 50 and greater than 100
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.BloodPressure)

# IQR-based lower and upper cut-offs for BloodPressure, shown next to a few
# percentiles and the extremes for comparison.
IQR_BP = Diabetes.BloodPressure.quantile(.75) - Diabetes.BloodPressure.quantile(.25)
Lower_Cut = Diabetes.BloodPressure.quantile(.25) - (1.5 * IQR_BP)
Lower_Cut
Diabetes.BloodPressure.quantile(.10)
Diabetes.BloodPressure.quantile(.05)
U_Cut_BP = Diabetes.BloodPressure.quantile(.75) + (1.5 * IQR_BP)
U_Cut_BP
Diabetes.BloodPressure.quantile(.75)
Diabetes.BloodPressure.quantile(.90)
Diabetes.BloodPressure.quantile(.99)
Diabetes.BloodPressure.min()
Diabetes.BloodPressure.max()
Since 99% of women have a blood pressure of 106 or below and the 1st percentile of blood pressure is 0, we will take the upper and lower limits from the IQR
# Clip BloodPressure to the IQR-based cut-offs. The result is assigned back
# because Series.clip_lower/clip_upper were removed in pandas 1.0.
Diabetes['BloodPressure'] = Diabetes['BloodPressure'].clip(lower=Lower_Cut, upper=U_Cut_BP)

# BloodPressure distribution and skewness, overall and by Outcome.
sns.distplot(Diabetes.BloodPressure)
Diabetes.BloodPressure.skew()
Diabetes.BloodPressure[Diabetes.Outcome == 0].skew()
Diabetes.BloodPressure[Diabetes.Outcome == 1].skew()
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.distplot, 'BloodPressure')
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(plt.hist, 'BloodPressure')

# Mean and standard deviation of BloodPressure for each outcome group.
mean_BP_non_Diabetic = Diabetes.BloodPressure[Diabetes.Outcome == 0].mean()
sd_BP_non_Diabetic = Diabetes.BloodPressure[Diabetes.Outcome == 0].std()
mean_BP_Diabetic = Diabetes.BloodPressure[Diabetes.Outcome == 1].mean()
sd_BP_Diabetic = Diabetes.BloodPressure[Diabetes.Outcome == 1].std()

# NOTE(review): the counts in the percentages below are hard-coded, presumably
# copied from the .shape outputs - re-check them if the data changes.
Diabetes.BloodPressure[(Diabetes.Outcome == 0) & (Diabetes.BloodPressure > 42.39) & (Diabetes.BloodPressure < 96.7)].shape
Diabetes.BloodPressure[(Diabetes.Outcome == 0)].shape
# Same value as the former `470/5`, written like the other percentages.
Perc_Conentration_Non_Diabetic = 470 * 100 / 500
Perc_Conentration_Non_Diabetic
Diabetes.BloodPressure[(Diabetes.Outcome == 1) & (Diabetes.BloodPressure > 42.59) & (Diabetes.BloodPressure < 103.2)].shape
Diabetes.BloodPressure[(Diabetes.Outcome == 1)].shape
Perc_Conentration_Diabetic = 243 * 100 / 268
Perc_Conentration_Diabetic

x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'BloodPressure')
# x/y are given by keyword; seaborn >= 0.12 rejects them positionally.
sns.swarmplot(data=Diabetes, x='Outcome', y='BloodPressure')
1) Mean skinfold thickness value of Non-Diabetic woman is 21.9 with standard deviation of 11.9, which states 66.8% of women have skinfold thickness of triceps between 9.3 and 42.6
2) Mean skinfold thickness value of diabetic women is 29.3 with a standard deviation of 8.22, which states that 60% of women have a triceps skinfold thickness between 12.9 and 45.8
3) Hence SkinFold Thickness of Tricep is a good but not reliable measure of classifying in diabetic and non diabetic
4) Skinfold Thickness of a person can be never 0 hence we can consider it as missing value because data has significant amount of 0 so we have done imputation using median since it wasnt a normal distributed data
5) Skinfold Thickness variable after imputation and outlier treatment is approximately symmetric but when we bifercate it with respect to outcome variable , for non diabetic women it is moderately positive skewed, where as for diabetic women it is approximately symmetric
6) There are very less non diabetic women who have thickness above 41
7) It is a moderate skewed variable
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.SkinThickness)

# IQR-based cut-offs for SkinThickness, shown next to a few percentiles and
# the maximum for comparison.
IQR_SKFT = Diabetes.SkinThickness.quantile(.75) - Diabetes.SkinThickness.quantile(.25)
ucut = Diabetes.SkinThickness.quantile(.75) + 1.5 * IQR_SKFT
lcut = Diabetes.SkinThickness.quantile(.25) - 1.5 * IQR_SKFT
ucut
lcut
Diabetes.SkinThickness.quantile(.99)
Diabetes.SkinThickness.quantile(.30)
Diabetes.SkinThickness.max()
Since 99% of women have a skin thickness of 51.33 or below and 30% of women have 8.2 or below, it is peculiar that 29% of women have a skin thickness value of 0,
so here we will use the upper (99%) and lower (30%) percentiles [since the value of skin thickness cannot be 0] to deal with outliers
# Clip SkinThickness to its 99th percentile, then floor it at the 30th
# percentile of the already capped column (same order as before). The result
# is assigned back because Series.clip_upper/clip_lower were removed in
# pandas 1.0.
Diabetes['SkinThickness'] = Diabetes['SkinThickness'].clip(upper=Diabetes['SkinThickness'].quantile(.99))
Diabetes['SkinThickness'] = Diabetes['SkinThickness'].clip(lower=Diabetes['SkinThickness'].quantile(.30))

# SkinThickness distribution and skewness, overall and by Outcome.
sns.distplot(Diabetes.SkinThickness)
Diabetes.SkinThickness.skew()
Diabetes.SkinThickness[Diabetes.Outcome == 0].skew()
Diabetes.SkinThickness[Diabetes.Outcome == 1].skew()
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.distplot, 'SkinThickness')

# Mean and standard deviation of SkinThickness for each outcome group.
mean_SKFT_Level_non_Diabetic = Diabetes.SkinThickness[Diabetes.Outcome == 0].mean()
sd_SKFT_Level_non_Diabetic = Diabetes.SkinThickness[Diabetes.Outcome == 0].std()
mean_SKFT_Level_Diabetic = Diabetes.SkinThickness[Diabetes.Outcome == 1].mean()
sd_SKFT_Level_Diabetic = Diabetes.SkinThickness[Diabetes.Outcome == 1].std()
mean_SKFT_Level_non_Diabetic + 2 * sd_SKFT_Level_non_Diabetic

# NOTE(review): the counts in the percentages below are hard-coded, presumably
# copied from the .shape outputs - re-check them if the data changes.
Diabetes.SkinThickness[(Diabetes.Outcome == 0) & (Diabetes.SkinThickness > 0) & (Diabetes.SkinThickness < 45.9)].shape
# Same value as the former `485/5`, written like the other percentages.
Perc_Concentration_Non_Diabetic = 485 * 100 / 500
Perc_Concentration_Non_Diabetic
Diabetes.SkinThickness[(Diabetes.Outcome == 1) & (Diabetes.SkinThickness > 12.9) & (Diabetes.SkinThickness < 45.81)].shape
Perc_Concentration_Diabetic = 162 * 100 / 268
Perc_Concentration_Diabetic

x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'SkinThickness')
# x/y are given by keyword; seaborn >= 0.12 rejects them positionally.
sns.swarmplot(data=Diabetes, x='Outcome', y='SkinThickness')
1) Mean insulin level of non diabetic women is 77.9 and standard deviation is 68.4 , which means 93.2% of women insuline level lies between 0 to 214.6
2) Mean insulin level of diabetic women is 101.5 with standard deviation of 87.3, which states 100% of women have insulin level between 0 to 276.1
3) Type 1 Diabetic patients have low insulin levels but low insulin levels can be associated with other dieseases also like Hypopituitarism, pancreatic cancer or pancreatic disease. High insulin levels are generally seen in Type 2 Diabetes , but patients who are not diabetic can have high insulin level because of other health issues like hyperinsulinemia , obessity , heart disease, use of drugs, etc, so non diabetic women have high insulin because of other underlying factors
4) Insulin is highly positive skewed data , but when we bifercate it with respect to outcome variable , it is highly positive skewed for non diabetic women and moderately positive skewed data for diabetic
5) It is very rare for non diabetic women having insulin levels above 200
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.Insulin)

# Cap Insulin at Q3 + 1.5 * IQR. The result is assigned back because
# Series.clip_upper was removed in pandas 1.0.
IQR_I = Diabetes.Insulin.quantile(.75) - Diabetes.Insulin.quantile(.25)
ucut = Diabetes.Insulin.quantile(.75) + 1.5 * IQR_I
Diabetes['Insulin'] = Diabetes['Insulin'].clip(upper=ucut)

# Insulin distribution and skewness, overall and by Outcome.
sns.distplot(Diabetes.Insulin)
Diabetes.Insulin.skew()
Diabetes.Insulin[Diabetes.Outcome == 0].skew()
Diabetes.Insulin[Diabetes.Outcome == 1].skew()
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.distplot, 'Insulin')

# Mean and standard deviation of Insulin for each outcome group.
mean_insulin_Level_non_Diabetic = Diabetes.Insulin[Diabetes.Outcome == 0].mean()
std_insulin_Level_non_Diabetic = Diabetes.Insulin[Diabetes.Outcome == 0].std()
mean_insulin_Level_Diabetic = Diabetes.Insulin[Diabetes.Outcome == 1].mean()
std_insulin_Level_Diabetic = Diabetes.Insulin[Diabetes.Outcome == 1].std()

# NOTE(review): the count in the percentage below is hard-coded, presumably
# copied from the .shape output - re-check it if the data changes.
Diabetes.Insulin[(Diabetes.Outcome == 0) & (Diabetes.Insulin > 0) & (Diabetes.Insulin < 214.61)].shape
# Same value as the former `466/5`, written like the other percentages.
Perc_Concentration_Non_Diabetic = 466 * 100 / 500
Diabetes.Insulin[(Diabetes.Outcome == 1) & (Diabetes.Insulin > 0) & (Diabetes.Insulin < 276.2)].shape

x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'Insulin')
# x/y are given by keyword; seaborn >= 0.12 rejects them positionally.
sns.swarmplot(data=Diabetes, x='Outcome', y='Insulin')
1) Mean BMI level of non diabetic women is 30.5 and standard deviation is 6.8 , which means 95% of women BMI lies between 16.8 to 44.2
2) Mean BMI of diabetic women is 35.1 with standard deviation of 6.4, which states 94.4% of women have BMI between 22.2 to 47.9
3) Diabetic women are majorly concentrated on high BMI level
4) The BMI variable is approximately symmetric, and when we bifurcate it by outcome it remains approximately symmetric for both non-diabetic and diabetic women
5) It is very rare that BMI of non diabetic women is greater than 49 and BMI of Diabetic women less than 23
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.BMI)

# Clip BMI to Q1 - 1.5 * IQR and Q3 + 1.5 * IQR. The result is assigned back
# because Series.clip_lower/clip_upper were removed in pandas 1.0.
IQR_BMI = Diabetes.BMI.quantile(.75) - Diabetes.BMI.quantile(.25)
lcut = Diabetes.BMI.quantile(.25) - 1.5 * IQR_BMI
ucut = Diabetes.BMI.quantile(.75) + 1.5 * IQR_BMI
Diabetes['BMI'] = Diabetes['BMI'].clip(lower=lcut, upper=ucut)

# BMI distribution and skewness, overall and by Outcome.
sns.distplot(Diabetes.BMI)
Diabetes.BMI.skew()
Diabetes.BMI[Diabetes.Outcome == 0].skew()
Diabetes.BMI[Diabetes.Outcome == 1].skew()
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.distplot, 'BMI')

# Mean and standard deviation of BMI for each outcome group.
mean_BMI_Level_non_Diabetic = Diabetes.BMI[Diabetes.Outcome == 0].mean()
sd_BMI_Level_non_Diabetic = Diabetes.BMI[Diabetes.Outcome == 0].std()
mean_BMI_Level_Diabetic = Diabetes.BMI[Diabetes.Outcome == 1].mean()
sd_BMI_Level_Diabetic = Diabetes.BMI[Diabetes.Outcome == 1].std()

# NOTE(review): the counts in the percentages below are hard-coded, presumably
# copied from the .shape outputs - re-check them if the data changes.
Diabetes.BMI[(Diabetes.Outcome == 0) & (Diabetes.BMI > 16.79) & (Diabetes.BMI < 44.21)].shape
Perc_concentration_non_diabetic = 475 * 100 / 500
Diabetes.BMI[(Diabetes.Outcome == 1) & (Diabetes.BMI > 22.19) & (Diabetes.BMI < 47.9)].shape
# The diabetic share gets its own name; it used to overwrite the
# non-diabetic value computed two lines above.
Perc_concentration_diabetic = 253 * 100 / 268

x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'BMI')
# x/y are given by keyword; seaborn >= 0.12 rejects them positionally.
sns.swarmplot(data=Diabetes, x='Outcome', y='BMI')
1) Mean DPF level of non diabetic women is .42 and standard deviation is .26 , which means 94.8% of women DPF level lies between 0 to .95
2) Mean DPF level of diabetic women is .53 with a standard deviation of .31, which states that 92.5% of women have a DPF level between 0 and 1.15
3) It is very rare for non diabetic women to have DPF more than 1
4) Variable is highly positive skewed , but when we bifercate it with respect to outcome, there is high positive skewness for non diabetic women and for diabetic women there is moderate positive skewness
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.DiabetesPedigreeFunction)

# Cap DiabetesPedigreeFunction at Q3 + 1.5 * IQR. The result is assigned back
# because Series.clip_upper was removed in pandas 1.0.
IQR_DPF = Diabetes.DiabetesPedigreeFunction.quantile(.75) - Diabetes.DiabetesPedigreeFunction.quantile(.25)
ucut = Diabetes.DiabetesPedigreeFunction.quantile(.75) + 1.5 * IQR_DPF
Diabetes['DiabetesPedigreeFunction'] = Diabetes['DiabetesPedigreeFunction'].clip(upper=ucut)

# DPF distribution and skewness, overall and by Outcome.
sns.distplot(Diabetes.DiabetesPedigreeFunction)
Diabetes.DiabetesPedigreeFunction.skew()
Diabetes.DiabetesPedigreeFunction[Diabetes.Outcome == 0].skew()
Diabetes.DiabetesPedigreeFunction[Diabetes.Outcome == 1].skew()
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.distplot, 'DiabetesPedigreeFunction')

# Mean and standard deviation of DPF for each outcome group.
mean_DPF_Level_non_Diabetic = Diabetes.DiabetesPedigreeFunction[Diabetes.Outcome == 0].mean()
sd_DPF_Level_non_Diabetic = Diabetes.DiabetesPedigreeFunction[Diabetes.Outcome == 0].std()
mean_DPF_Level_Diabetic = Diabetes.DiabetesPedigreeFunction[Diabetes.Outcome == 1].mean()
sd_DPF_Level_Diabetic = Diabetes.DiabetesPedigreeFunction[Diabetes.Outcome == 1].std()

# NOTE(review): the counts in the percentages below are hard-coded, presumably
# copied from the .shape outputs - re-check them if the data changes.
Diabetes.DiabetesPedigreeFunction[(Diabetes.Outcome == 0) & (Diabetes.DiabetesPedigreeFunction > 0) & (Diabetes.DiabetesPedigreeFunction < .96)].shape
perc_concentration_non_diabetic = 474 * 100 / 500
Diabetes.DiabetesPedigreeFunction[(Diabetes.Outcome == 1) & (Diabetes.DiabetesPedigreeFunction > 0) & (Diabetes.DiabetesPedigreeFunction < 1.16)].shape
perc_concentration_diabetic = 248 * 100 / 268

x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'DiabetesPedigreeFunction')
# x/y are given by keyword; seaborn >= 0.12 rejects them positionally.
sns.swarmplot(data=Diabetes, x='Outcome', y='DiabetesPedigreeFunction')
1) Mean age of non-diabetic women is 31 and standard deviation is 11.4 , hence 92.8% of non diabetic women are between age of 8 to 54 years
2) Mean age of diabetic women is 37 and the standard deviation is 10.9, hence 96.6% of diabetic women are between the ages of 15 and 59 years
3) Probablity of women between age of 20 to 30 will be non diabetic is .78 that means only 22% of women in this age range develops Diabetes
4) Probability or chances of having diabetes increases after age of 30
5) The Age variable is highly positively skewed; when we bifurcate it by the outcome variable it remains the same for non-diabetic women but becomes moderately positively skewed for diabetic women, which means women having diabetes are more on the upper age side
6) There is a rare chance for women above age of 58 to be non diabetic
# Name the axis explicitly: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.boxplot(x=Diabetes.Age)

# Cap Age at Q3 + 1.5 * IQR. The result is assigned back because
# Series.clip_upper was removed in pandas 1.0.
IQR_A = Diabetes.Age.quantile(.75) - Diabetes.Age.quantile(.25)
ucut = Diabetes.Age.quantile(.75) + 1.5 * IQR_A
Diabetes['Age'] = Diabetes['Age'].clip(upper=ucut)

# Age distribution and skewness, overall and by Outcome.
sns.distplot(Diabetes.Age)
Diabetes.Age.skew()
Diabetes.Age[Diabetes.Outcome == 0].skew()
Diabetes.Age[Diabetes.Outcome == 1].skew()
x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.distplot, 'Age')

# Mean and standard deviation of Age for each outcome group.
mean_age_non_diabetic = Diabetes.Age[Diabetes.Outcome == 0].mean()
sd_age_non_diabetic = Diabetes.Age[Diabetes.Outcome == 0].std()
mean_age_diabetic = Diabetes.Age[Diabetes.Outcome == 1].mean()
sd_age_diabetic = Diabetes.Age[Diabetes.Outcome == 1].std()

Diabetes.Age[(Diabetes.Outcome == 0) & (Diabetes.Age > 7) & (Diabetes.Age < 55)].shape
Diabetes.Age[(Diabetes.Outcome == 0) & (Diabetes.Age > 19) & (Diabetes.Age < 31)].shape
Diabetes.Age[(Diabetes.Outcome == 1) & (Diabetes.Age > 19) & (Diabetes.Age < 31)].shape
# NOTE(review): 327 and 90 are hard-coded, presumably the two age 20-30 counts
# shown just above - re-check them if the data changes.
Prob_non_Diabetic = 327 / (327 + 90)
Diabetes.Age[(Diabetes.Outcome == 1) & (Diabetes.Age > 14) & (Diabetes.Age < 60)].shape

x = sns.FacetGrid(Diabetes, col='Outcome')
x = x.map(sns.boxplot, 'Age')
# x/y are given by keyword; seaborn >= 0.12 rejects them positionally.
sns.swarmplot(data=Diabetes, x='Outcome', y='Age')
# Pregnancies against Glucose as a plain matplotlib scatter.
plt.scatter(x=Diabetes.Pregnancies, y=Diabetes.Glucose)
Most of the women have glucose levels in between 57 to 200 , but as the number of pregnancies increases number of women in the bracket decreases or you can say there are less number of women having pregnancies above 8
# Regression plots of Glucose on Pregnancies: all rows first, then one panel
# per Outcome value.
lm = sns.lmplot(data=Diabetes, x='Pregnancies', y='Glucose', x_jitter=True)
lm = sns.lmplot(
    data=Diabetes,
    x='Pregnancies',
    y='Glucose',
    col='Outcome',
    x_jitter=True,
)
Glucose Level and No. Of Pregnancies doesnt have perfect linear relationship as they have high errors , hence linear regression line is not the apt best fit line , the relation seems to be more of polynomial relationship
# Pregnancies vs Glucose, coloured by Outcome and sized by Age.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Glucose', hue='Outcome',
                x_jitter=True, size='Age')
women whose pregnacy is 7 have more number of diabetic women as compared to others and between the suger level of 99 to 200, most of them are between age of 40 to 60. Most Pregnant Diabetic Women have Glucose level high above120
# Pregnancies vs BloodPressure: raw scatter, regression fit, then scatter
# plots styled by Outcome with different hue/size variables.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
plt.scatter(Diabetes.Pregnancies, Diabetes.BloodPressure)
sns.lmplot(x="Pregnancies", y="BloodPressure", data=Diabetes)
sns.scatterplot(data=Diabetes, x='Pregnancies', y='BloodPressure',
                style='Outcome', hue='Insulin', size='Age')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='BloodPressure',
                style='Outcome', hue='Insulin', size='BMI')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='BloodPressure',
                style='Outcome', hue='SkinThickness', size='Insulin',
                x_jitter=True, y_jitter=True)
Majority of women are having blood pressure between 60 to 90 , There is no linear relationship found between Blood Pressure and Pregnancies, also the lmplot shows that Linear Regression line is not the best fit as the errors are huge, it has some polynomial sort of relation. Those who are classified as Diabetic , most of them have insulin level above 100 and are above . It can be said that those who are diabetic mostly have BMI in range of 15-45 . Blood pressure can be maintained while having diabeties . Those who have extreme level of low blood pressure i.e in the range of 40 to 55 are more on lower or no pregnancy side that is between 0 to 3 . This indicates as the number of pregnancy increases either the complication due to low blood pressure increases so they have to due dligently keep their BP in maintainable range .SkinThickness are high of both diabetic and non diabetic and even for women with pregnancy 0 , it states :
1) Women can be obese 2) Women can be pregnant 3) Women can have another underlying disease 4) Women can be diabetic 5) It can be genetically inherited
It is very peculiar to see that even at the age around 20's women have got upto 14 pregnancies , it states that the tribe comes from some backward community where they are not educated.
# Pregnancies vs SkinThickness, plain and then coloured by Outcome with
# Glucose or Age as marker size.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
plt.scatter(Diabetes.Pregnancies, Diabetes.SkinThickness)
sns.scatterplot(data=Diabetes, x='Pregnancies', y='SkinThickness', hue='Outcome')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='SkinThickness', hue='Outcome', size='Glucose')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='SkinThickness', hue='Outcome', size='Age')
Most of the Diabetic women have higher skin thickness and the number of diabetic women having greater skin thickness increases as the number of pregnancies increases. Also non diabetic women are also there who have high level of skinthickness . It doesnt significantly say that Skinthickness is on higher side because of Diabeties , it can be one of the following reason :
1) Diabeties
2) Obesity
3) Any other disease
Those who have high skinthickness , most of the women have high sugar level, out of which most of them are diabetic
Keeping Outcome Variable asside , the graph suggest as the number of pregnancies increases , women tends to be on higher skinthickness level , it suggest that till the 4 pregnancies women can have skin thickness between 10-20 and there are significant percent of women who falls in that range , but after fourth pregnancies women tend to have skinthickness on higher level.
It's very peculiar that till the fourth pregnancy women with higher skinthickness are of age nearing to 20 , very few of 40 and above, but it comes to fifth and onwards number of pregnancies , there are most women who are 40 and above.
# Pregnancies vs Insulin, plain and then coloured by Outcome with a different
# variable as marker size each time.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Insulin')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Insulin', hue='Outcome', x_jitter=True)
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Insulin', hue='Outcome',
                size='BMI', x_jitter=True, y_jitter=True, alpha=.8)
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Insulin', hue='Outcome',
                size='Age', x_jitter=True, y_jitter=True, alpha=.8)
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Insulin', hue='Outcome',
                size='BloodPressure', x_jitter=True, y_jitter=True, alpha=.8)
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Insulin', hue='Outcome',
                size='Glucose', x_jitter=True, y_jitter=True, alpha=.8)
1) As number of pregnancies increases , number of women having insulin level above 100 decreases
2) Majority of Diabetic women are above insulin level 100
3) Women having high insulin value and being diabetic still they are pregnant , it means insulin value and being diabetic isnt much of a complication
4) Most women have a high BMI, i.e. they don't fall in the ideal range of BMI; most diabetic women who have a high BMI have a high insulin level, that is 100 and above
5) Since the number of pregnancies per woman is unusually high (up to 14), we cannot state that the high BMI of women is only because of diabetes; it can be because of the following: a) Number of pregnancies b) Obesity c) Diabetes
6) Most diabetic women have high insulin with low blood pressure, that is 75 and below. There are a few whose blood pressure is above 75 and below 120; their blood pressure is in the ideal range but they have a high insulin level, i.e. 150 and above
7) Mostly those women who are diabetic and have high insulin level that is above 100 have high glucose level also i.e 100 and above
# Pregnancies vs Age, plain and then coloured by Outcome with Glucose,
# BloodPressure or BMI as marker size.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Age')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Age', hue='Outcome')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Age', hue='Outcome', size='Glucose')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Age', hue='Outcome', size='BloodPressure')
sns.scatterplot(data=Diabetes, x='Pregnancies', y='Age', hue='Outcome', size='BMI')
1) As the age increases number of pregnancies by women decreases
2) It is peculiar to see that those women who have pregnancies above 10 they lie in age range of 35-55, which states that they would have gotten pregnant at very early stage of their life
3) Most Diabetic women are above age of 35 also have pregnancies 6 and above
4) Women above age of 30 have high glucose level that is 100 and above, low blood pressure that is 75 and below and most of them are diabetic
5) Most women age above 30 and having number of pregnancies 6 and above are having BMI on higher side that is 30 and above
# Glucose vs BloodPressure, plain and then coloured by Outcome with Insulin,
# BMI or Age as marker size.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Glucose', y='BloodPressure')
sns.scatterplot(data=Diabetes, x='Glucose', y='BloodPressure', hue='Outcome')
sns.scatterplot(data=Diabetes, x='Glucose', y='BloodPressure', hue='Outcome', size='Insulin')
sns.scatterplot(data=Diabetes, x='Glucose', y='BloodPressure', hue='Outcome',
                size='BMI', x_jitter=True)
sns.scatterplot(data=Diabetes, x='Glucose', y='BloodPressure', hue='Outcome',
                size='Age', x_jitter=True)
1) There is higher concentration of women having BP in range of 50 to 90 and Glucose in range of 90 to 150
2) Concentration of Diabetic women is more where women have BP between 50 to 90 and Glucose in range of 120-400 , the same area of concentration of diabetic women have majority of women whose insulin level is 100 and above, it states that Glucose , BP and Insulin have some relation , specifically glucose and insulin, as we can see Higher the glucose level higher the insulin
3) Most of women who are diabetic and have higher glucose level that is 120 and above have higher BMI that is 30 and above , so BMI and Glucose tend to have a relation
# Glucose vs SkinThickness, plain and then coloured by Outcome with
# Pregnancies, BMI or Age as marker size.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Glucose', y='SkinThickness')
sns.scatterplot(data=Diabetes, x='Glucose', y='SkinThickness', hue='Outcome')
sns.scatterplot(data=Diabetes, x='Glucose', y='SkinThickness', hue='Outcome', size='Pregnancies')
sns.scatterplot(data=Diabetes, x='Glucose', y='SkinThickness', hue='Outcome', size='BMI')
sns.scatterplot(data=Diabetes, x='Glucose', y='SkinThickness', hue='Outcome', size='Age')
1) As Glucose level increases , number of women having thicker skinthickness increases
2) Majority of concentration of diabetic women falls above glucose level of 120 and skin thickness 20 and above
3) Majority of diabetic women having pregnancies 5 and above had skin thickness 22.5 and above
4) Mojority of diabetic women having abnormal BMI that is 30 and above had skin thickness 22.5 and above and had glucose level 120 and above
5) Probability of women to have diabeties are very less if their skinthickness is less than 22.5
# Glucose vs Insulin, plain and then coloured by Outcome with BMI,
# BloodPressure or Age as marker size.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Glucose', y='Insulin')
sns.scatterplot(data=Diabetes, x='Glucose', y='Insulin', hue='Outcome')
sns.scatterplot(data=Diabetes, x='Glucose', y='Insulin', hue='Outcome', size='BMI')
sns.scatterplot(data=Diabetes, x='Glucose', y='Insulin', hue='Outcome', size='BloodPressure')
sns.scatterplot(data=Diabetes, x='Glucose', y='Insulin', hue='Outcome', size='Age')
1) Glucose and Insulin shows a linear relationship, i.e with increase in glucose , insulin increases
2) Major concentration of diabetic women are having glucose levels 120 and above , insulin 100 and above
3) There is no definite increase in BMI due to fact of insulin and glucose
4) Majority of diabetic women have blood pressure in control hence , insulin and glucose doesnt have any effect on blood pressure
5) Majority of diabetic women of age 40 and above have Glucose level 120 and above and insulin 100 and above
# Glucose vs DiabetesPedigreeFunction, plain and then coloured by Outcome
# with Insulin or Age as marker size.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Glucose', y='DiabetesPedigreeFunction')
sns.scatterplot(data=Diabetes, x='Glucose', y='DiabetesPedigreeFunction', hue='Outcome')
sns.scatterplot(data=Diabetes, x='Glucose', y='DiabetesPedigreeFunction',
                hue='Outcome', size='Insulin')
sns.scatterplot(data=Diabetes, x='Glucose', y='DiabetesPedigreeFunction',
                hue='Outcome', size='Age')
1) Major concentration of women is in section where glucose level range from 80 to 130 and DPF from 0 to .8
2) Majority of Diabetic women lies in section where glucose level is 120 and above and DPF range from 0 to .8
3) The probability of diabetic women to have high insulin level when DPF is .8 and above and Glucose level is 120 and above is more as compared to women having DPF lesser than .8
4) Majority of Diabetic women age 40 and abover are concentrated in section of DPF less than .7 and glucose level 140 and above
# Glucose vs Age, plain and then coloured by Outcome with Insulin or
# DiabetesPedigreeFunction as marker size.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Glucose', y='Age')
sns.scatterplot(data=Diabetes, x='Glucose', y='Age', hue='Outcome')
sns.scatterplot(data=Diabetes, x='Glucose', y='Age', hue='Outcome', size='Insulin')
sns.scatterplot(data=Diabetes, x='Glucose', y='Age', hue='Outcome',
                size='DiabetesPedigreeFunction')
1) The majority of women are in the section with age below 32 and glucose level between 80 and 130, but women in this section are only occasionally diabetic.
2) Almost all women of all age have diabeties whose glucose levels are 150 and above
3) Glucose is very important attribute to classify women diabetic or not
4) Majority of women having high insulin value ie 100 and above, have glucose level 150 and above and falls under age of 45
# BloodPressure vs SkinThickness, plain and coloured by Outcome.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='BloodPressure', y='SkinThickness')
sns.scatterplot(data=Diabetes, x='BloodPressure', y='SkinThickness', hue='Outcome')
1) Blood Pressure and Skin Thickness doesnt seems to have any relation
# BloodPressure vs Insulin, plain and coloured by Outcome.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='BloodPressure', y='Insulin')
sns.scatterplot(data=Diabetes, x='BloodPressure', y='Insulin', hue='Outcome')
1) Blood Pressure and Insulin doesnt seems to have any relation
# BloodPressure vs DiabetesPedigreeFunction, plain and coloured by Outcome.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='BloodPressure', y='DiabetesPedigreeFunction')
sns.scatterplot(data=Diabetes, x='BloodPressure', y='DiabetesPedigreeFunction', hue='Outcome')
No Relation Between Blood Pressure & DPF
# BloodPressure vs Age by Outcome, followed by SkinThickness vs BMI (plain,
# by Outcome, and with Glucose or Insulin as marker size).
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='BloodPressure', y='Age', hue='Outcome')
sns.scatterplot(data=Diabetes, x='SkinThickness', y='BMI')
sns.scatterplot(data=Diabetes, x='SkinThickness', y='BMI', hue='Outcome')
sns.scatterplot(data=Diabetes, x='SkinThickness', y='BMI', hue='Outcome', size='Glucose')
sns.scatterplot(data=Diabetes, x='SkinThickness', y='BMI', hue='Outcome', size='Insulin')
1) Skin Thickness and BMI have linear relationship where a unit change in skin thickness changes the BMI of women linearly
2) Majority of diabetic women are concentrated in section where skin thickness is 22 and above and BMI 27 and above
3) Women who are diabetic, have BMI 27 or above and skin thickness above or 22 have higher glucose levels i.e 100 and above and also higher insulin levels 100 and above
# Insulin vs BMI: scatter, regression fit, and scatter coloured by Outcome.
# Columns are passed by name; seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(data=Diabetes, x='Insulin', y='BMI')
sns.lmplot(x='Insulin', y='BMI', data=Diabetes)
sns.scatterplot(data=Diabetes, x='Insulin', y='BMI', hue='Outcome')
1) Insulin and BMI has some sort of linear relationship but not perfect relation
# Work on an independent copy of the columns at positions 1 through 7, so
# that adding columns to Diab later neither warns about chained assignment
# nor touches Diabetes.
Diab = Diabetes.iloc[:, 1:8].copy()
# Annotated correlation matrix of the selected columns.
sns.heatmap(Diab.corr(), annot=True)
A few pairs of variables have a significant correlation:
1) SkinThickness and BMI
2) Insulin and Glucose
3) BMI and Insulin
4) BMI & BP
5) Age & Glucose
6) BP & Age
The relation between the two variables are both scientifically and statistically proven
sns.pairplot(Diab)
Hence this pair plot justify and summarises the above exploratory and relationship finding methedology. Now we know which variables are having relationship and can be significant in classifying women diabetic or non diabetic . Also using the above findings we can use them to make some sort of rules or associations which will classify women as diabetic or non diabetic.
It was a study on a skewed data hence there must be some affect of bias of any nature
Diabetes.dtypes
Diab.head()
# Derived feature: skin thickness divided by glucose level. `assign` returns
# a new frame, so no chained-assignment warning is raised even though Diab
# was sliced out of Diabetes.
Diab = Diab.assign(Thickness_Per_G=Diab.SkinThickness / Diab.Glucose)
Since SkinThickness and BMI are correlated , which is because skin thickness is indirectly a part of BMI as skin thickness states the thickness of the skin which is basicaly a mass index , and thickness of skin is due to obesity as one of the reason, hence obessed people must have high Glucose or sugar level, hence a new variable is being derived that is skinthickness per glucose level.
Earlier Glucose and BMI were not directly correlated but now Thickness_Per_G and BMI have a good correlation , which you can see below in the heat map.
Theoretically as BMI increases , insulin resistance also increases which results in increased blood glucose level.
# Correlation matrix again, now including the derived Thickness_Per_G column.
sns.heatmap(Diab.corr(), annot=True)
# BMI vs Thickness_Per_G, coloured by Outcome (taken from Diabetes) and sized
# by Insulin. x/y are given by keyword; seaborn >= 0.12 rejects them
# positionally.
sns.scatterplot(x=Diab.BMI, y=Diab.Thickness_Per_G, hue=Diabetes.Outcome,
                size=Diab.Insulin)
As BMI increases , Thickness per glucose level also increases
Majority diabetic women have BMI 30 or above and Thickness per Glucose level is .4 or below
It is very rare that person with thickness per glucose be higher than .6 and still be diabetic as there glucose level will be less and so does insulin level
# Insulin vs Thickness_Per_G. x/y are given by keyword; seaborn >= 0.12
# rejects them positionally.
sns.scatterplot(x=Diab.Insulin, y=Diab.Thickness_Per_G)